# coding: utf-8
import numpy as np
import pandas as pd
import xgboost as xgb
from src.utils import predict_xgb, quantile_cut, load_xgb
from joblib import dump, load

####################################################################################
####################################################################################
############################# AREA OF INPUT PARAMETERS #############################
####################################################################################
####################################################################################

##### Environment Parameters

# Root directory of the project; it should be the directory that contains the "README.md" file.
PATH = "."
# Directory used for the data files.
PATH_DATA = f"{PATH}/data"
# Directory used for the model files.
PATH_MODELS = f"{PATH}/models"

# Default is False. When True, the PD model is retrained; when False, the
# pretrained PD model is loaded instead.
retrain_model = False
# Random seed, used when retrain_model is True. Set it to 1989 to reproduce the PD model.
random_seed = 1989

####################################################################################
####################################################################################
############# Load the raw data and pretrained model ###############################
####################################################################################
####################################################################################
##### Set random seed
# Seed NumPy's global random generator with the configured seed.
np.random.seed(random_seed)

##### Load raw data ######
# The first spreadsheet row is skipped while reading, and the "SEX" column is
# dropped immediately after loading.
dt_raw = pd.read_excel(f"{PATH_DATA}/default of credit card clients.xls", skiprows=[0])
dt_raw = dt_raw.drop(columns="SEX")

##### Load conversion model ######
# NOTE: joblib's load() unpickles the file, so only load model files from a trusted source.
model_conversion = load(f"{PATH_MODELS}/conversion_model.pkl")

##### Load pretrained PD model ######
# Only done when no retraining is requested; otherwise xgb_pd is produced by
# the training step further down the script.
if retrain_model is False:
    xgb_pd = load_xgb(
        f"{PATH_MODELS}/models_pd.model",
        f"{PATH_MODELS}/models_pd_features.npy",
    )

####################################################################################
####################################################################################
############################## Prepare the raw dataset #############################
####################################################################################
####################################################################################

##### Mapping features
# Numeric education codes -> labels; every code other than 1, 2 and 3 is "OTHERS".
dict_mapping_edu = {
    0: "OTHERS",
    1: "GRAD_SCHOOL",
    2: "UNIVERSITY",
    3: "HIGH_SCHOOL",
    4: "OTHERS",
    5: "OTHERS",
    6: "OTHERS",
}

# Numeric marital-status codes -> labels.
dict_mapping_marriage = {
    0: "OTHERS",
    1: "MARRIED",
    2: "SINGLE",
    3: "OTHERS",
}

# Replace the codes by their labels in place. A code that is missing from the
# mapping raises KeyError instead of being silently turned into a missing value.
for column_, mapping_ in (("EDUCATION", dict_mapping_edu), ("MARRIAGE", dict_mapping_marriage)):
    dt_raw[column_] = dt_raw[column_].apply(lambda code: mapping_[code])

##### Simulate PD MODEL and adjustment
var_target = "default payment next month"
var_IDs = ["ID"]
# Every column that is neither an identifier nor the target is used as a model feature.
var_model = [x for x in dt_raw.columns if x not in var_IDs + [var_target]]

if retrain_model is True:
    xgb_params = {"eta":0.01, "seed":random_seed, "subsample":0.8, "objective":"reg:logistic",
                  "booster":"gbtree", "nthread":8, "max_depth":3, "min_child_weight":10, "colsample_bytree":1}
    # One-hot encode the categorical features; "_zl_" separates the column name from the level.
    dt_raw_dummy = pd.get_dummies(dt_raw[var_model], prefix_sep="_zl_")
    xgb_data = xgb.DMatrix(data = dt_raw_dummy.values, label=dt_raw[var_target].values,
                           feature_names=list(dt_raw_dummy.columns.values))
    # Cross-validate with early stopping to choose the number of boosting rounds.
    xgb_cv_pd = xgb.cv(params=xgb_params, dtrain=xgb_data, num_boost_round=2000, verbose_eval=1, early_stopping_rounds=100,
                       metrics="logloss")
    # Row i of the CV history holds the metrics after i + 1 boosting rounds, so the
    # zero-based position of the minimum has to be shifted by one to obtain a round
    # count. Without the shift the final model is trained with one round too few
    # (and with zero rounds if the minimum sits in the first row).
    best_iter = int(np.argmin(xgb_cv_pd["test-logloss-mean"].values)) + 1
    xgb_pd = xgb.train(params=xgb_params, dtrain=xgb_data, num_boost_round=best_iter)
    # Persist the booster together with the feature names it was trained on.
    xgb_pd.save_model(f"{PATH_MODELS}/models_pd.model")
    np.save(f"{PATH_MODELS}/models_pd_features.npy", np.array(dt_raw_dummy.columns.values))


# Rescale the predicted PD (x 0.25, capped at 0.2) to bring it closer to a loan-request setting.
dt_raw["PD"] = np.minimum(0.2, predict_xgb(dt_raw, xgb_pd) * 0.25)

##### Define the requested amount

# The requested amount is 20% of the credit limit, capped at 100000.
dt_raw["AMOUNT"] = np.minimum(0.2 * dt_raw["LIMIT_BAL"], 100000)

# Columns carried over (next to the identifiers) into the working dataset.
var_keep = ["AMOUNT", "EDUCATION", "MARRIAGE", "AGE", "PD"]

dt = dt_raw.loc[:, var_IDs + var_keep].reset_index(drop=True)

##### Get Risk Score and Set Reference Rate

# Risk grade -> [lower, upper] PD bounds; a row belongs to a grade when
# lower < PD <= upper.
dict_mapping_risk_score = {
    "A": [-np.inf, 0.0216],
    "B": [0.0216, 0.0312],
    "C": [0.0312, 0.0437],
    "D": [0.0437, 0.0784],
    "E": [0.0784, np.inf],
}

for grade_, (pd_lower_, pd_upper_) in dict_mapping_risk_score.items():
    in_grade_ = dt["PD"].gt(pd_lower_) & dt["PD"].le(pd_upper_)
    dt.loc[in_grade_, "RISK_SCORE"] = grade_

# The reference interest rate is 90% of the PD, bounded to the range [0.01, 0.18].
dt["interest_rate"] = (dt["PD"] * 0.9).clip(lower=0.01, upper=0.18)

##### Create clusters for Amount and Age
# NOTE(review): quantile_cut is presumably what adds the "AMOUNT_CLUSTER" and
# "AGE_CLUSTER" columns to dt in place (both are read later) - confirm in src.utils.
for column_ in ("AMOUNT", "AGE"):
    quantile_cut(dt, column_, bin=5)

##### Normalize the amount and the discount base (reward and cost2), and filter the dataset
# NOTE(review): amount_norm is scaled by the maximum taken BEFORE the filter below,
# while discount_base_norm is scaled by the maximum taken AFTER it - confirm this is intended.
max_amount = dt["AMOUNT"].max()
dt["amount_norm"] = dt["AMOUNT"] / max_amount

# Raw discount base = amount x interest rate; only rows strictly below 10000 are kept.
dt["discount_base_norm"] = dt["AMOUNT"] * dt["interest_rate"]
keep_rows_ = dt["discount_base_norm"] < 10000
dt = dt[keep_rows_].reset_index(drop=True)

# At this point the column still holds the raw product, so its maximum is the
# maximum of AMOUNT x interest_rate over the rows that were kept.
max_discount_base_norm = dt["discount_base_norm"].max()
dt["discount_base_norm"] = dt["discount_base_norm"] / max_discount_base_norm

##### Get the context index - Mapping context to index
# Columns that together identify one context.
var_context = [
    "RISK_SCORE",
    "AMOUNT",
    "EDUCATION",
    "MARRIAGE",
    "AGE",
    "interest_rate",
    "AMOUNT_CLUSTER",
    "AGE_CLUSTER",
]

# One row per distinct context, numbered from 1 in order of first appearance.
context_list = dt[var_context].drop_duplicates()
context_list = context_list.reset_index(drop=True)
context_list["index_context"] = 1 + context_list.index.values

# Attach the context number to every row of the dataset.
dt = pd.merge(dt, context_list, how="left", on=var_context).reset_index(drop=True)

####################################################################################
####################################################################################
############################## Export the dataset ##################################
####################################################################################
####################################################################################

var_interest = "interest_rate"
var_context_base = ["RISK_SCORE", "AMOUNT", "EDUCATION", "MARRIAGE", "AGE"]
var_id = "ID"

# Columns written to the environment dataset, in output order.
var_export_ = [
    var_id,
    "index_context",
    *var_context_base,
    var_interest,
    "AMOUNT_CLUSTER",
    "AGE_CLUSTER",
    "amount_norm",
    "discount_base_norm",
]
dt[var_export_].to_parquet(f"{PATH_DATA}/dt_env.parq")


####################################################################################
####################################################################################
#################### Visualize the coefficient of Conversion Model #################
####################################################################################
####################################################################################

# Features of the conversion model; the categorical ones are one-hot encoded below.
var_conversion = [
    "interest_rate",
    "RISK_SCORE",
    "EDUCATION",
    "MARRIAGE",
    "AMOUNT_CLUSTER",
    "AGE_CLUSTER",
]
dt_dummy = pd.get_dummies(dt[var_conversion], prefix_sep="_zl_")

# Pair every dummy column with the corresponding coefficient of the loaded model.
# NOTE(review): assumes the dummy columns come out in the same order and number
# as the features the conversion model was fitted on - confirm.
coef_table = pd.DataFrame(
    {
        "Name": list(dt_dummy.columns.values),
        "coef": list(model_conversion.coef_[0]),
    }
)
coef_table["coef"] = coef_table["coef"].round(4)

# Print the rounded intercept first, then the coefficient table.
print(np.round(model_conversion.intercept_, 4))
print(coef_table)